Call Libraries
library(tidyverse)
library(car)
library(moments)
library(glmnet)
Calling the Transformed Datasets
income_cleaned = read_csv('Shiny_app/data/income_cleaned.csv')
Rows: 1921 Columns: 5── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Name, Group
dbl (3): Year, Num, Avg
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
industry_cleaned = read_csv('Shiny_app/data/industry_cleaned.csv')
Rows: 2476 Columns: 5── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Name, Group
dbl (3): Year, Num, Avg
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Creating the Models
sat.model.summary <- function (df, field, sat.formula){
#Shapiro-Wilks test to evaluate normality
print(shapiro.test(df[[field]]))
#Kurtosis evaluation (normal distribution has a value close to 3)
print('kurtosis')
print(kurtosis(df[[field]]))
linear.model.cleaned = lm(sat.formula, data = df)
print(summary(linear.model.cleaned))
plot(linear.model.cleaned)
#histograms of response variable to check distribution
print(df %>%
ggplot(aes_string(field)) +
geom_histogram() +
labs(title = 'Average Credit Amount Distribution') +
theme(plot.title = element_text(hjust = 0.5)))
#Checking multicollinearity using VIF measurement
print(vif(linear.model.cleaned))
influencePlot(linear.model.cleaned)
#avPlots(linear.model.cleaned)
}
sat.formula <- Avg ~ .
sat.field <- 'Avg'
sat.model.summary(income_cleaned, sat.field, sat.formula)
Shapiro-Wilk normality test
data: df[[field]]
W = 0.17297, p-value < 2.2e-16
[1] "kurtosis"
[1] 169.7518
Call:
lm(formula = sat.formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-11338570 -547702 94139 421302 87181761
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.157e+08 5.512e+07 -2.100 0.03590 *
Year 5.721e+04 2.731e+04 2.095 0.03634 *
NameAlternative Fuels and Electric Vehicle Recharging Property Credit -3.287e+05 1.242e+06 -0.265 0.79140
NameAlternative Minimum Tax Credit 6.538e+05 9.757e+05 0.670 0.50286
NameBeer Production Credit 3.316e+05 1.290e+06 0.257 0.79715
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 1.429e+06 9.960e+05 1.434 0.15162
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 4.098e+06 1.421e+06 2.883 0.00398 **
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 1.182e+06 9.927e+05 1.191 0.23386
NameBrownfield Tax Credits - Remediation Real Property Tax Credit -7.587e+04 9.920e+05 -0.076 0.93904
NameClean Heating Fuel Credit 7.148e+04 1.024e+06 0.070 0.94436
NameConservation Easement Tax Credit 1.143e+05 1.064e+06 0.107 0.91444
NameCredit for Employment of Persons with Disabilities -9.343e+05 1.119e+06 -0.835 0.40391
NameCredit for Purchase of an Automated External Defibrillator -1.449e+05 9.695e+05 -0.150 0.88117
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities 2.316e+05 1.779e+06 0.130 0.89645
NameEmpire State Apprentice Tax Credit -7.983e+05 1.524e+06 -0.524 0.60040
NameEmpire State Commercial Production Credit 2.183e+05 1.291e+06 0.169 0.86576
NameEmpire State Film Post Production Credit 2.521e+04 1.049e+06 0.024 0.98082
NameEmpire State Film Production Credit 1.145e+07 9.967e+05 11.485 < 2e-16 ***
NameEmpire State Musical and Theatrical Production Credit -2.342e+05 1.775e+06 -0.132 0.89507
NameExcelsior Jobs Program Credit 1.035e+05 9.545e+05 0.108 0.91366
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit 2.752e+06 8.862e+05 3.105 0.00193 **
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit 4.845e+05 9.213e+05 0.526 0.59899
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes 1.245e+06 9.239e+05 1.348 0.17785
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners -1.025e+05 9.549e+05 -0.107 0.91455
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit -6.145e+04 9.387e+05 -0.065 0.94781
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners 9.119e+04 1.044e+06 0.087 0.93038
NameFarm Workforce Retention Credit -7.802e+03 1.285e+06 -0.006 0.99516
NameFarmers' School Tax Credit 1.440e+05 1.021e+06 0.141 0.88786
NameHire a Veteran Credit -1.238e+06 1.782e+06 -0.695 0.48721
NameHistoric Properties Rehabilitation Credit 1.352e+06 1.055e+06 1.282 0.20015
NameIndustrial or Manufacturing Business Tax Credit 4.997e+05 1.074e+06 0.465 0.64173
NameInvestment Tax Credit 4.463e+05 8.895e+05 0.502 0.61588
NameInvestment Tax Credit for the Financial Services Industry 3.976e+05 1.008e+06 0.395 0.69325
NameLife Sciences Research & Development Tax Credit -8.207e+04 2.099e+06 -0.039 0.96882
NameLong-Term Care Insurance Credit 1.246e+05 9.808e+05 0.127 0.89892
NameLow-Income Housing Credit -2.764e+04 1.113e+06 -0.025 0.98020
NameMinimum Wage Reimbursement Credit -2.931e+05 1.006e+06 -0.291 0.77075
NameMortgage Servicing Tax Credit -4.540e+05 1.085e+06 -0.418 0.67565
NameNew York Youth Jobs Program Tax Credit -2.587e+05 9.381e+05 -0.276 0.78279
NameQETC Capital Tax Credit 2.616e+05 1.386e+06 0.189 0.85032
NameQETC Employment Credit 1.335e+05 1.026e+06 0.130 0.89641
NameQETC Facilities, Operations, and Training Credit 5.153e+05 1.918e+06 0.269 0.78820
NameReal Property Tax Relief Credit for Manufacturing -2.635e+05 9.654e+05 -0.273 0.78490
NameSpecial Additional Mortgage Recording Tax Credit 3.678e+04 9.244e+05 0.040 0.96827
NameSTART-UP NY Tax Elimination Credit 4.061e+03 1.130e+06 0.004 0.99713
Group1,000,000 - 24,999,999 2.170e+05 3.501e+05 0.620 0.53548
Group100,000 - 499,999 1.352e+05 3.579e+05 0.378 0.70572
Group100,000,000 - 499,999,999 9.644e+05 3.937e+05 2.449 0.01441 *
Group25,000,000 - 49,999,999 4.519e+05 4.223e+05 1.070 0.28474
Group50,000,000 - 99,999,999 4.021e+05 4.235e+05 0.950 0.34247
Group500,000 - 999,999 2.195e+05 3.880e+05 0.566 0.57170
Group500,000,000 - and over 3.073e+06 3.846e+05 7.991 2.33e-15 ***
GroupZero or Net Loss 9.536e+05 3.339e+05 2.856 0.00433 **
Num -4.520e+02 8.569e+02 -0.527 0.59791
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3868000 on 1867 degrees of freedom
Multiple R-squared: 0.2361, Adjusted R-squared: 0.2144
F-statistic: 10.88 on 53 and 1867 DF, p-value: < 2.2e-16
GVIF Df GVIF^(1/(2*Df))
Year 2.125869 1 1.458036
Name 3.521453 43 1.014746
Group 1.510763 8 1.026124
Num 1.645484 1 1.282764
income.model <- lm(sat.formula, data = income_cleaned)
sat.model.summary(industry_cleaned, sat.field, sat.formula)
Shapiro-Wilk normality test
data: df[[field]]
W = 0.22287, p-value < 2.2e-16
[1] "kurtosis"
[1] 135.5482
Call:
lm(formula = sat.formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-11581741 -371031 -23164 154182 28917959
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.926e+07 2.051e+07 -1.914 0.055677 .
Year 1.936e+04 1.016e+04 1.905 0.056959 .
NameAlternative Fuels and Electric Vehicle Recharging Property Credit 5.584e+04 5.062e+05 0.110 0.912180
NameAlternative Minimum Tax Credit 3.523e+05 4.208e+05 0.837 0.402556
NameBeer Production Credit 8.799e+04 6.261e+05 0.141 0.888249
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 1.919e+06 4.473e+05 4.290 1.86e-05 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 4.064e+06 7.278e+05 5.584 2.62e-08 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 1.054e+06 4.779e+05 2.206 0.027462 *
NameBrownfield Tax Credits - Remediation Real Property Tax Credit 1.327e+05 4.760e+05 0.279 0.780429
NameClean Heating Fuel Credit 2.785e+05 4.595e+05 0.606 0.544549
NameConservation Easement Tax Credit 2.743e+04 4.700e+05 0.058 0.953457
NameCredit for Employment of Persons with Disabilities 1.574e+05 5.069e+05 0.311 0.756142
NameCredit for Purchase of an Automated External Defibrillator 1.082e+05 4.465e+05 0.242 0.808639
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities 2.090e+05 8.286e+05 0.252 0.800866
NameEmpire State Apprentice Tax Credit -3.324e+05 1.012e+06 -0.329 0.742537
NameEmpire State Commercial Production Credit 3.524e+05 6.172e+05 0.571 0.568123
NameEmpire State Film Post Production Credit 5.332e+05 5.224e+05 1.021 0.307585
NameEmpire State Film Production Credit 1.170e+07 4.785e+05 24.458 < 2e-16 ***
NameEmpire State Musical and Theatrical Production Credit 7.676e+04 9.010e+05 0.085 0.932112
NameExcelsior Jobs Program Credit 7.086e+05 4.393e+05 1.613 0.106849
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit 1.381e+06 4.318e+05 3.199 0.001399 **
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit 3.715e+05 4.213e+05 0.882 0.377942
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes 1.160e+06 4.192e+05 2.767 0.005700 **
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners 3.634e+05 4.311e+05 0.843 0.399359
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit 2.567e+05 4.265e+05 0.602 0.547350
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners 7.379e+04 5.041e+05 0.146 0.883634
NameFarm Workforce Retention Credit 2.727e+04 5.352e+05 0.051 0.959361
NameFarmers' School Tax Credit 1.287e+05 5.133e+05 0.251 0.802102
NameHire a Veteran Credit 1.459e+05 8.253e+05 0.177 0.859709
NameHistoric Properties Rehabilitation Credit 1.875e+06 4.841e+05 3.874 0.000110 ***
NameInvestment Tax Credit 7.182e+05 4.129e+05 1.739 0.082075 .
NameInvestment Tax Credit for the Financial Services Industry 6.339e+05 5.757e+05 1.101 0.270913
NameLife Sciences Research & Development Tax Credit -2.243e+03 9.007e+05 -0.002 0.998014
NameLong-Term Care Insurance Credit 1.385e+05 4.200e+05 0.330 0.741537
NameLow-Income Housing Credit 1.642e+06 5.494e+05 2.988 0.002833 **
NameMinimum Wage Reimbursement Credit 9.624e+04 4.381e+05 0.220 0.826159
NameMortgage Servicing Tax Credit 2.077e+05 6.462e+05 0.321 0.747934
NameNew York Youth Jobs Program Tax Credit 1.953e+05 4.273e+05 0.457 0.647604
NameQETC Capital Tax Credit 2.986e+05 5.868e+05 0.509 0.610823
NameQETC Employment Credit 8.982e+04 4.465e+05 0.201 0.840607
NameQETC Facilities, Operations, and Training Credit 3.477e+05 6.711e+05 0.518 0.604441
NameReal Property Tax Relief Credit for Manufacturing 1.520e+05 4.423e+05 0.344 0.731171
NameSpecial Additional Mortgage Recording Tax Credit 2.609e+05 4.432e+05 0.589 0.556155
NameSTART-UP NY Tax Elimination Credit 1.703e+04 4.750e+05 0.036 0.971411
GroupAdministrative and Support and Waste Management and Remediation Services -2.280e+03 2.519e+05 -0.009 0.992777
GroupAdministrative/Support/Waste Management/Remediation Services -3.173e+03 2.798e+05 -0.011 0.990952
GroupAgriculture, Forestry, Fishing and Hunting 7.269e+04 2.330e+05 0.312 0.755097
GroupArts, Entertainment, and Recreation 5.866e+05 2.317e+05 2.532 0.011408 *
GroupConstruction -1.699e+04 2.171e+05 -0.078 0.937621
GroupEducational Services 5.569e+04 2.948e+05 0.189 0.850176
GroupFinance and Insurance 2.139e+05 2.034e+05 1.051 0.293237
GroupHealth Care and Social Assistance 1.300e+04 2.290e+05 0.057 0.954731
GroupInformation -2.433e+05 2.178e+05 -1.117 0.263917
GroupManagement of Companies and Enterprises 3.355e+05 1.949e+05 1.721 0.085332 .
GroupManufacturing 6.786e+05 2.028e+05 3.346 0.000831 ***
GroupMining -1.561e+04 3.593e+05 -0.043 0.965352
GroupMining, Quarrying, and Oil and Gas Extraction 1.084e+05 3.011e+05 0.360 0.718901
GroupOther Services (except Public Administration) -6.462e+04 2.217e+05 -0.291 0.770697
GroupProfessional, Scientific, and Technical Services 3.736e+05 2.085e+05 1.792 0.073214 .
GroupReal Estate and Rental and Leasing 9.506e+04 2.044e+05 0.465 0.641882
GroupRetail Trade 4.746e+04 2.036e+05 0.233 0.815756
GroupTransportation and Warehousing -1.866e+04 2.300e+05 -0.081 0.935321
GroupUtilities 5.308e+05 2.633e+05 2.016 0.043917 *
GroupWholesale Trade 6.106e+04 2.081e+05 0.293 0.769261
Num -9.657e+02 4.272e+02 -2.260 0.023889 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1612000 on 2411 degrees of freedom
Multiple R-squared: 0.4674, Adjusted R-squared: 0.4533
F-statistic: 33.06 on 64 and 2411 DF, p-value: < 2.2e-16
GVIF Df GVIF^(1/(2*Df))
Year 2.190806 1 1.480137
Name 5.185764 42 1.019787
Group 2.724217 20 1.025371
Num 1.370920 1 1.170863
industry.model <- lm(sat.formula, data = industry_cleaned)
Selecting Specific Diagnostic plots for linear models
plot(income.model, which = 1)
plot(income.model, which = 2)
plot(income.model, which = 3)
plot(income.model, which = 5)
Correcting violation of Normality in previous model with BoxCox transform
bc_func <- function (lm.cleaned, lambda.range){
bc = boxCox(lm.cleaned, lambda = lambda.range)
#Extracting the best lambda value.
return(bc$x[which(bc$y == max(bc$y))])
}
#Income Group Dataset
income.lambda.bc = bc_func(income.model, seq(-0.2, 0.2, 1/10))
income.lambda.bc
[1] -0.01414141
#Industry Group Dataset
industry.lambda.bc = bc_func(industry.model, seq(-0.2, 0.2, 1/10))
industry.lambda.bc
[1] -0.03434343
bc_transform <- function(df, lambda.bc){
return (df %>%
mutate(Avg.bc = (Avg^lambda.bc -1)/lambda.bc) %>%
select(-c(Avg))) #took out field Amount
}
#Income Group Dataset
income_cleaned_bc <- bc_transform(income_cleaned, income.lambda.bc)
income.model.bc = lm(Avg.bc ~ ., data = income_cleaned_bc)
#Industry Group Dataset
industry_cleaned_bc <- bc_transform(industry_cleaned, industry.lambda.bc)
industry.model.bc = lm(Avg.bc ~ ., data = industry_cleaned_bc)
Testing out appending a newly created dataframe to a list of dataframes with a Name for Shiny app
# income_cleaned_bc
# all.data <- list('income_cleaned' = income_cleaned, 'industry_cleaned' = industry_cleaned)
# all.data <- append(all.data, list('income_cleaned_bc' = income_cleaned_bc))
# all.data[['income_cleaned_bc']]
Testing out bc_func for migration to Shiny App global.R file
bc_funct <- function (df, lm.cleaned, lambda.range){
bc = boxCox(lm.cleaned, lambda = lambda.range)
lambda.bc = bc$x[which(bc$y == max(bc$y))]
return(df %>%
mutate(Avg.bc = (Avg^lambda.bc -1)/lambda.bc) %>%
select(-c(Avg)))
}
bc_funct(income_cleaned, income.model, seq(-0.2, 0.2, 1/10))
Checking linear regression assumptions for the transformed data.
sat.formula.bc <- Avg.bc ~ .
sat.field.bc <- 'Avg.bc'
#Income
sat.model.summary(income_cleaned_bc, sat.field.bc, sat.formula.bc)
Shapiro-Wilk normality test
data: df[[field]]
W = 0.99696, p-value = 0.000782
[1] "kurtosis"
[1] -0.2553313
Call:
lm(formula = sat.formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-6.6597 -0.5738 -0.0113 0.5668 4.4826
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.634e+01 1.456e+01 -2.496 0.012643 *
Year 2.288e-02 7.214e-03 3.171 0.001543 **
NameAlternative Fuels and Electric Vehicle Recharging Property Credit -8.643e-01 3.281e-01 -2.634 0.008510 **
NameAlternative Minimum Tax Credit -2.052e+00 2.577e-01 -7.962 2.91e-15 ***
NameBeer Production Credit 5.687e-01 3.407e-01 1.669 0.095234 .
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 1.715e+00 2.630e-01 6.521 8.99e-11 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 2.676e+00 3.754e-01 7.130 1.43e-12 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 1.331e+00 2.622e-01 5.076 4.24e-07 ***
NameBrownfield Tax Credits - Remediation Real Property Tax Credit 5.458e-02 2.620e-01 0.208 0.834991
NameClean Heating Fuel Credit -3.086e+00 2.705e-01 -11.409 < 2e-16 ***
NameConservation Easement Tax Credit -2.137e+00 2.810e-01 -7.606 4.45e-14 ***
NameCredit for Employment of Persons with Disabilities -3.118e+00 2.956e-01 -10.547 < 2e-16 ***
NameCredit for Purchase of an Automated External Defibrillator -2.938e+00 2.560e-01 -11.473 < 2e-16 ***
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities -5.673e-01 4.698e-01 -1.207 0.227430
NameEmpire State Apprentice Tax Credit -2.207e+00 4.024e-01 -5.485 4.71e-08 ***
NameEmpire State Commercial Production Credit 6.075e-02 3.410e-01 0.178 0.858634
NameEmpire State Film Post Production Credit 1.101e+00 2.769e-01 3.977 7.25e-05 ***
NameEmpire State Film Production Credit 2.853e+00 2.632e-01 10.838 < 2e-16 ***
NameEmpire State Musical and Theatrical Production Credit 2.508e-01 4.688e-01 0.535 0.592803
NameExcelsior Jobs Program Credit 4.651e-01 2.521e-01 1.845 0.065178 .
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit 8.876e-01 2.341e-01 3.792 0.000154 ***
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit 1.775e-01 2.433e-01 0.730 0.465701
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes 1.220e+00 2.440e-01 5.000 6.26e-07 ***
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners 1.182e-01 2.522e-01 0.469 0.639260
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit -8.497e-01 2.479e-01 -3.427 0.000623 ***
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners -1.358e+00 2.756e-01 -4.928 9.02e-07 ***
NameFarm Workforce Retention Credit -1.612e+00 3.394e-01 -4.749 2.20e-06 ***
NameFarmers' School Tax Credit -1.311e+00 2.696e-01 -4.863 1.26e-06 ***
NameHire a Veteran Credit -2.915e+00 4.706e-01 -6.195 7.15e-10 ***
NameHistoric Properties Rehabilitation Credit 1.918e+00 2.786e-01 6.886 7.81e-12 ***
NameIndustrial or Manufacturing Business Tax Credit -1.718e+00 2.836e-01 -6.059 1.66e-09 ***
NameInvestment Tax Credit 7.147e-02 2.349e-01 0.304 0.760985
NameInvestment Tax Credit for the Financial Services Industry 3.176e-01 2.662e-01 1.193 0.232870
NameLife Sciences Research & Development Tax Credit 7.033e-01 5.543e-01 1.269 0.204728
NameLong-Term Care Insurance Credit -2.863e+00 2.590e-01 -11.051 < 2e-16 ***
NameLow-Income Housing Credit -9.063e-01 2.940e-01 -3.083 0.002080 **
NameMinimum Wage Reimbursement Credit -1.241e+00 2.656e-01 -4.674 3.17e-06 ***
NameMortgage Servicing Tax Credit -9.378e-01 2.865e-01 -3.273 0.001084 **
NameNew York Youth Jobs Program Tax Credit -1.330e+00 2.478e-01 -5.367 9.01e-08 ***
NameQETC Capital Tax Credit 4.561e-01 3.661e-01 1.246 0.212956
NameQETC Employment Credit -5.888e-01 2.709e-01 -2.174 0.029855 *
NameQETC Facilities, Operations, and Training Credit 5.799e-01 5.065e-01 1.145 0.252462
NameReal Property Tax Relief Credit for Manufacturing -1.518e+00 2.550e-01 -5.956 3.09e-09 ***
NameSpecial Additional Mortgage Recording Tax Credit -2.309e-01 2.441e-01 -0.946 0.344374
NameSTART-UP NY Tax Elimination Credit -2.193e+00 2.985e-01 -7.345 3.05e-13 ***
Group1,000,000 - 24,999,999 1.090e+00 9.246e-02 11.785 < 2e-16 ***
Group100,000 - 499,999 3.590e-01 9.451e-02 3.798 0.000150 ***
Group100,000,000 - 499,999,999 1.685e+00 1.040e-01 16.202 < 2e-16 ***
Group25,000,000 - 49,999,999 1.325e+00 1.115e-01 11.883 < 2e-16 ***
Group50,000,000 - 99,999,999 1.473e+00 1.118e-01 13.172 < 2e-16 ***
Group500,000 - 999,999 6.032e-01 1.025e-01 5.886 4.67e-09 ***
Group500,000,000 - and over 2.332e+00 1.016e-01 22.953 < 2e-16 ***
GroupZero or Net Loss 9.934e-01 8.817e-02 11.267 < 2e-16 ***
Num -1.533e-03 2.263e-04 -6.775 1.66e-11 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.022 on 1867 degrees of freedom
Multiple R-squared: 0.7368, Adjusted R-squared: 0.7293
F-statistic: 98.61 on 53 and 1867 DF, p-value: < 2.2e-16
GVIF Df GVIF^(1/(2*Df))
Year 2.125869 1 1.458036
Name 3.521453 43 1.014746
Group 1.510763 8 1.026124
Num 1.645484 1 1.282764
#Industry
sat.model.summary(industry_cleaned_bc, sat.field.bc, sat.formula.bc)
Shapiro-Wilk normality test
data: df[[field]]
W = 0.9902, p-value = 5.826e-12
[1] "kurtosis"
[1] -0.4869026
Call:
lm(formula = sat.formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-6.2888 -0.4697 0.0183 0.4928 3.9068
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -6.095e+01 1.107e+01 -5.505 4.09e-08 ***
Year 3.440e-02 5.488e-03 6.269 4.30e-10 ***
NameAlternative Fuels and Electric Vehicle Recharging Property Credit 2.665e-01 2.733e-01 0.975 0.329737
NameAlternative Minimum Tax Credit -2.463e+00 2.272e-01 -10.839 < 2e-16 ***
NameBeer Production Credit 6.334e-01 3.381e-01 1.873 0.061126 .
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 2.198e+00 2.415e-01 9.100 < 2e-16 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 2.727e+00 3.930e-01 6.939 5.06e-12 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 1.586e+00 2.581e-01 6.148 9.17e-10 ***
NameBrownfield Tax Credits - Remediation Real Property Tax Credit 9.695e-01 2.570e-01 3.772 0.000166 ***
NameClean Heating Fuel Credit -2.544e+00 2.481e-01 -10.252 < 2e-16 ***
NameConservation Easement Tax Credit -1.123e+00 2.538e-01 -4.423 1.02e-05 ***
NameCredit for Employment of Persons with Disabilities -1.245e+00 2.737e-01 -4.549 5.67e-06 ***
NameCredit for Purchase of an Automated External Defibrillator -1.535e+00 2.411e-01 -6.368 2.29e-10 ***
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities -8.171e-02 4.474e-01 -0.183 0.855119
NameEmpire State Apprentice Tax Credit -8.683e-01 5.464e-01 -1.589 0.112125
NameEmpire State Commercial Production Credit 6.235e-01 3.333e-01 1.871 0.061481 .
NameEmpire State Film Post Production Credit 1.439e+00 2.821e-01 5.100 3.66e-07 ***
NameEmpire State Film Production Credit 3.232e+00 2.584e-01 12.509 < 2e-16 ***
NameEmpire State Musical and Theatrical Production Credit 1.012e+00 4.865e-01 2.080 0.037608 *
NameExcelsior Jobs Program Credit 1.643e+00 2.372e-01 6.929 5.42e-12 ***
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit 1.310e+00 2.332e-01 5.617 2.17e-08 ***
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit 7.127e-01 2.275e-01 3.133 0.001750 **
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes 1.707e+00 2.264e-01 7.540 6.61e-14 ***
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners 9.777e-01 2.328e-01 4.200 2.77e-05 ***
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit 1.609e-01 2.303e-01 0.699 0.484921
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners -5.241e-01 2.722e-01 -1.925 0.054320 .
NameFarm Workforce Retention Credit -8.623e-01 2.890e-01 -2.984 0.002876 **
NameFarmers' School Tax Credit -1.158e+00 2.771e-01 -4.179 3.03e-05 ***
NameHire a Veteran Credit -1.040e+00 4.456e-01 -2.335 0.019638 *
NameHistoric Properties Rehabilitation Credit 2.397e+00 2.614e-01 9.172 < 2e-16 ***
NameInvestment Tax Credit 6.981e-01 2.230e-01 3.131 0.001763 **
NameInvestment Tax Credit for the Financial Services Industry 1.560e+00 3.108e-01 5.019 5.59e-07 ***
NameLife Sciences Research & Development Tax Credit 9.057e-01 4.864e-01 1.862 0.062697 .
NameLong-Term Care Insurance Credit -1.867e+00 2.268e-01 -8.231 3.01e-16 ***
NameLow-Income Housing Credit 1.331e+00 2.967e-01 4.487 7.57e-06 ***
NameMinimum Wage Reimbursement Credit -9.161e-01 2.366e-01 -3.872 0.000111 ***
NameMortgage Servicing Tax Credit 8.932e-01 3.489e-01 2.560 0.010534 *
NameNew York Youth Jobs Program Tax Credit 1.263e-01 2.307e-01 0.548 0.584069
NameQETC Capital Tax Credit 1.162e+00 3.168e-01 3.667 0.000251 ***
NameQETC Employment Credit -3.048e-01 2.411e-01 -1.264 0.206249
NameQETC Facilities, Operations, and Training Credit 1.147e+00 3.624e-01 3.165 0.001571 **
NameReal Property Tax Relief Credit for Manufacturing -6.559e-01 2.388e-01 -2.746 0.006077 **
NameSpecial Additional Mortgage Recording Tax Credit 7.505e-01 2.393e-01 3.136 0.001736 **
NameSTART-UP NY Tax Elimination Credit -1.834e+00 2.565e-01 -7.151 1.14e-12 ***
GroupAdministrative and Support and Waste Management and Remediation Services 2.878e-01 1.360e-01 2.116 0.034417 *
GroupAdministrative/Support/Waste Management/Remediation Services 1.675e-01 1.511e-01 1.109 0.267612
GroupAgriculture, Forestry, Fishing and Hunting -1.132e-01 1.258e-01 -0.899 0.368486
GroupArts, Entertainment, and Recreation 5.595e-01 1.251e-01 4.473 8.09e-06 ***
GroupConstruction -4.299e-02 1.172e-01 -0.367 0.713860
GroupEducational Services 2.253e-01 1.592e-01 1.415 0.157160
GroupFinance and Insurance 5.350e-01 1.099e-01 4.870 1.19e-06 ***
GroupHealth Care and Social Assistance -6.777e-02 1.237e-01 -0.548 0.583720
GroupInformation 7.098e-01 1.176e-01 6.037 1.81e-09 ***
GroupManagement of Companies and Enterprises 6.672e-01 1.052e-01 6.340 2.73e-10 ***
GroupManufacturing 4.608e-01 1.095e-01 4.208 2.67e-05 ***
GroupMining 2.515e-01 1.940e-01 1.296 0.194977
GroupMining, Quarrying, and Oil and Gas Extraction 3.869e-01 1.626e-01 2.379 0.017421 *
GroupOther Services (except Public Administration) -1.550e-01 1.197e-01 -1.295 0.195482
GroupProfessional, Scientific, and Technical Services 4.875e-01 1.126e-01 4.331 1.55e-05 ***
GroupReal Estate and Rental and Leasing 1.461e-01 1.104e-01 1.324 0.185596
GroupRetail Trade 3.754e-01 1.100e-01 3.414 0.000650 ***
GroupTransportation and Warehousing 2.125e-01 1.242e-01 1.711 0.087179 .
GroupUtilities 6.972e-01 1.422e-01 4.904 1.00e-06 ***
GroupWholesale Trade 4.367e-01 1.124e-01 3.886 0.000105 ***
Num 4.542e-04 2.307e-04 1.969 0.049082 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8703 on 2411 degrees of freedom
Multiple R-squared: 0.7631, Adjusted R-squared: 0.7569
F-statistic: 121.4 on 64 and 2411 DF, p-value: < 2.2e-16
GVIF Df GVIF^(1/(2*Df))
Year 2.190806 1 1.480137
Name 5.185764 42 1.019787
Group 2.724217 20 1.025371
Num 1.370920 1 1.170863
avPlots(income.model.bc)
NA
NA
NA
NA
NA
BIC comparison before and after BoxCox transform
BIC(income.model.bc, income.model)
BIC(industry.model.bc, industry.model)
Stepwise Regression on Income_cat_bc (boxcox transformed dataset)
#creating dummy variable columns for stepwise
dummy_func <- function (df){
x = model.matrix(Avg.bc ~., df)[, -1]
dummy_bc = as.data.frame(x) %>% mutate(Avg.bc = df$Avg.bc)
#colnames(dummy_bc) <- str_replace_all(colnames(dummy_bc), "-|'|/| |,|�|&" , '_')
colnames(dummy_bc) <- str_replace_all(colnames(dummy_bc), "[-'/ ,�&()`]" , '_')
return(dummy_bc)
}
Cleaning column names further so stepwise regression doesn’t present any errors
#Income Group Dataset
income.dummy.bc <- dummy_func(income_cleaned_bc)
#colnames(income.dummy.bc)[37] <- 'NameManufactureru0092s_Real_Property_Tax_Credit'
colnames(income.dummy.bc)
#Industry Group Dataset
industry.dummy.bc <- dummy_func(industry_cleaned_bc)
#colnames(industry.dummy.bc)[35] <- 'NameManufactureru0092s_Real_Property_Tax_Credit'
colnames(industry.dummy.bc)
Stepwise regression using BIC as the criteria (k = log(n)).
#Creating Stepwise Models
bcs = list(income = income.dummy.bc, industry = industry.dummy.bc)
model.fulls = list(income = lm(Avg.bc ~ ., data = income.dummy.bc), industry = lm(Avg.bc ~ ., data = industry.dummy.bc)) #All variables
model.emptys = list(income = lm(Avg.bc ~ 1, data = income.dummy.bc), industry = lm(Avg.bc ~ 1, data = industry.dummy.bc)) #intercept only
k = c('income', 'industry')
forwardBIC = list(income = NULL, industry = NULL)
backwardBIC = list(income = NULL, industry = NULL)
for (i in k){
bc = bcs[[i]]
scope = list(lower = formula(model.emptys[[i]]), upper = formula(model.fulls[[i]]))
n_obs = bc %>% count() %>% dplyr::first()
forwardBIC[[i]] = step(model.emptys[[i]], scope, direction = "forward", k = log(n_obs))
backwardBIC[[i]] = step(model.fulls[[i]], scope, direction = "backward", k = log(n_obs))
}
Selecting Best Formula per Dataset from Stepwise Regressions
bic_func(forwardBIC[['income']])
[1] "Adjusted R Squared:"
Error in summary(BIC.model) : object 'forwardBIC' not found
Best Model Selection from Stepwise
#The Best Models selected for both income and industry were forwardBIC.
#Income Group Dataset
income.best.formula <- forwardBIC[['income']]$call[[2]]
income.best.formula
#Industry Group Dataset
industry.best.formula <- forwardBIC[['industry']]$call[[2]]
industry.best.formula
Splitting data up into test data and training data (test data is for year 2019, training is the rest)
test_train_split <- function(dummy_bc, best.formula) {
# data.test <- dummy_bc %>% filter(Year == 2019)
# data.train <- dummy_bc %>% filter(Year != 2019)
X <- model.matrix(best.formula, data = dummy_bc)[,-1]
y <- as.matrix(dummy_bc %>% select(all.vars(best.formula)[1]))
set.seed(0)
train.i = sample(1:nrow(dummy_bc), 0.8*nrow(dummy_bc), replace = F)
#train
X.train <- X[train.i,]
y.train <- y[train.i,]
#test
X.test <- X[-train.i,]
y.test <- y[-train.i,]
data.train <- as.data.frame(cbind(y.train, X.train))
data.test <- as.data.frame(cbind(y.test, X.test))
colnames(data.train)[1] = all.vars(best.formula)[1]
colnames(data.test)[1] = all.vars(best.formula)[1]
return (list('X.train' = X.train, 'y.train' = y.train, 'X.test' = X.test, 'y.test' = y.test, 'data.train' = data.train, 'data.test' = data.test))
}
test_train_split_old <- function(dummy_bc, best.formula) {
# data.test <- dummy_bc %>% filter(Year == 2019)
# data.train <- dummy_bc %>% filter(Year != 2019)
set.seed(0)
train.i = sample(1:nrow(dummy_bc), 0.8*nrow(dummy_bc), replace = F)
data.train <- dummy_bc %>% slice(train.i)
data.test <- dummy_bc %>% slice(-train.i)
#train
#X.train <- as.matrix(data.train %>% select(-Avg.bc))
X.train <- model.matrix(best.formula, data = data.train)[,-1]
y.train <- as.matrix(data.train %>% select(all.vars(best.formula)[1]))
#test
#X.test <- as.matrix(data.test %>% select(-Avg.bc))
X.test <- model.matrix(best.formula, data = data.test)[,-1]
y.test <- as.matrix(data.test %>% select(all.vars(best.formula)[1]))
#return (list('X.train' = X.train, 'y.train' = y.train, 'X.test' = X.test, 'y.test' = y.test, 'data.train' = data.train, 'data.test' = data.test))
}
#Income
income.data.split.sat <- test_train_split(income.dummy.bc, Avg.bc ~ .)
income.data.split <- test_train_split(income.dummy.bc, income.best.formula)
#Industry
industry.data.split.sat <- test_train_split(industry.dummy.bc, Avg.bc ~ .)
industry.data.split <- test_train_split(industry.dummy.bc, industry.best.formula)
test_train_split(income_cleaned, sat.formula)
Lasso regression for comparison to Forward Stepwise
# calc_MSE <- function (model, x.test, y.test){
# y.predict <- predict(model, newdata = x.test)
# return(mean((y.predict - y.test)^2))
# }
# xy.splits <- list('income' = income.data.split,
# 'industry' = industry.data.split,
# 'income.sat' = income.data.split.sat,
# 'industry.sat' = industry.data.split.sat)
#
# lasso.list <- c('income', 'industry', 'income.sat', 'industry.sat')
# baseline.list <- c('income.sat', 'industry.sat', 'income.sat', 'industry.sat')
# ridge.alpha <- 0
#
# for (a in 1:4){
# i = lasso.list[a]
# b = baseline.list[a]
# X.train <- xy.splits[[i]][['X.train']]
# y.train <- xy.splits[[i]][['y.train']]
# X.test <- xy.splits[[i]][['X.test']]
# y.test <- xy.splits[[i]][['y.test']]
# X.test.sat <- xy.splits[[b]][['X.test']]
# y.test.sat <- xy.splits[[b]][['y.test']]
# data.train <- xy.splits[[i]][['data.train']]
# data.test <- xy.splits[[i]][['data.test']]
#
# #create lambda grid
# lambda.grid = 10^seq(2, -5, length = 100)
#
# #create lasso models with lambda.grid
# lasso.models = glmnet(X.train, y.train, alpha = ridge.alpha, lambda = lambda.grid)
#
# #visualize coefficient shrinkage
# plot(lasso.models, xvar = "lambda", label = TRUE, main = paste("Lasso Regression:", i))
#
# #Cross Validation to find best lambda
# set.seed(0)
# cv.lasso.models <- cv.glmnet(X.train, y.train, alpha = ridge.alpha, lambda = lambda.grid, nfolds = 10)
#
# #visualize cross validation for lambda that minimizes the mean squared error.
# plot(cv.lasso.models, main = paste("Lasso Regression:", i))
#
# #Checking the best lambda
# log(cv.lasso.models$lambda.min)
# best.lambda <- cv.lasso.models$lambda.min
# print(paste(i, ' best.lambda:', best.lambda))
# # best lambda with all the variables was found to be 0.0006892612
# # best lambda with only the bwdBIC coefficients included was found to be 0.0003053856
#
# #looking at the lasso coefficients for the best.lambda
# best.lambda.coeff <- predict(lasso.models, s = best.lambda, type = "coefficients")
# print('Number of Coefficients:')
# print(dim(best.lambda.coeff)[1])
#
# #fitting a model with the best lambda found to be 0.000689 and using it to make predictions for the testing data.
# lasso.best.lambda.train.pred <- predict(lasso.models, s = best.lambda, newx = X.test)
# lasso.best.lambda.train.pred
#
# #checking MSE
# MSE.lasso <- mean((lasso.best.lambda.train.pred - y.test)^2)
# sat.model.bc <- lm(Avg.bc ~., data = data.train)
#
#
# temp.df <- as.data.frame(X.test.sat) #temp fix
# colnames(temp.df) <- str_replace_all(colnames(temp.df), "[`]", '')
#
# y.predict <- predict(sat.model.bc, newdata = temp.df)
# MSE.sat <- mean((y.predict - y.test.sat)^2)
#
# print(paste(i, ' Lasso MSE: ', MSE.lasso, ' ', b, ' Saturated MSE: ', MSE.sat))
#
# metrics <- eval_results(y.test, lasso.best.lambda.train.pred, data.test)
# print(metrics)
# print('********************************')
# }
Function to show metrics (R^2 and MSE) for Regularization (Ridge/Lasso)
regularization_func <- function (data, alpha, name){
X.train <- data[['X.train']]
y.train <- data[['y.train']]
X.test <- data [['X.test']]
y.test <- data[['y.test']]
data.test <- data[['data.test']]
# print(colnames(X.train))
# print(colnames(X.test))
# print(setdiff(colnames(X.train), colnames(X.test)))
#create lambda grid
lambda.grid = 10^seq(10, -10, length = 100)
#create lasso models with lambda.grid
lasso.models = glmnet(X.train, y.train, alpha = alpha, lambda = lambda.grid)
#visualize coefficient shrinkage
# plot(lasso.models, xvar = "lambda", label = TRUE, main = paste("Lasso Regression:", i))
#Cross Validation to find best lambda
set.seed(0)
cv.lasso.models <- cv.glmnet(X.train, y.train, alpha = alpha, lambda = lambda.grid, nfolds = 10)
#visualize cross validation for lambda that minimizes the mean squared error.
plot(cv.lasso.models, main = paste("Alpha:", alpha, "Regression:", name))
#Checking the best lambda
# log(cv.lasso.models$lambda.min)
# best.lambda <- cv.lasso.models$lambda.min
# print(paste(i, ' best.lambda:', best.lambda))
# best lambda with all the variables was found to be 0.0006892612
# best lambda with only the bwdBIC coefficients included was found to be 0.0003053856
#looking at the lasso coefficients for the best.lambda
# best.lambda.coeff <- predict(lasso.models, s = cv.lasso.models$lambda.min, type = "coefficients")
# print('Number of Coefficients:')
# print(dim(best.lambda.coeff)[1])
#fitting a model with the best lambda found to be 0.000689 and using it to make predictions for the testing data.
lasso.best.lambda.train.pred <- predict(lasso.models, s = cv.lasso.models$lambda.min, newx = X.test)
lasso.best.lambda.train.pred
#checking MSE
MSE.lasso <- mean((lasso.best.lambda.train.pred - y.test)^2)
print(paste('Dimensions of the Alpha:', alpha, ' Regression Coefficients for:', name))
print(dim(coef(lasso.models))[1])
p = dim(coef(lasso.models))[1]
return(eval_results(y.test, lasso.best.lambda.train.pred, data.test, p)['Rsquare'])
}
regularization_func(all.splits[['income_cleaned']], 0, 'income_cleaned')
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: income_cleaned"
[1] 54
Rsquare
-0.01019062
regularization_func(all.splits[['income_cleaned_bc']], 0, 'income_cleaned_bc')
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: income_cleaned_bc"
[1] 54
Rsquare
0.6750177
regularization_func(all.splits[['income.data.split.sat']], 0, 'income.data.split.sat')
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: income.data.split.sat"
[1] 54
Rsquare
0.6750177
#all.dfs <- list(income_cleaned, income_cleaned_bc, income.dummy.bc)
formulas <- list(income_cleaned = sat.formula,
income_cleaned_bc = sat.formula.bc,
income.data.split.sat = sat.formula.bc,
income.data.split.best = income.best.formula,
industry_cleaned = sat.formula,
industry_cleaned_bc = sat.formula.bc,
industry.data.split.sat = sat.formula.bc,
industry.data.split.best = industry.best.formula)
all.splits <- list(
'income_cleaned' = test_train_split(income_cleaned, formulas[['income_cleaned']]),
'income_cleaned_bc' = test_train_split(income_cleaned_bc, formulas[['income_cleaned_bc']]),
'income.data.split.sat' = test_train_split(income.dummy.bc, formulas[['income.data.split.sat']]),
'income.data.split.best' = test_train_split(income.dummy.bc, formulas[['income.data.split.best']]),
'industry_cleaned' = test_train_split(industry_cleaned, formulas[['industry_cleaned']]),
'industry_cleaned_bc' = test_train_split(industry_cleaned_bc, formulas[['industry_cleaned_bc']]),
'industry.data.split.sat' = test_train_split(industry.dummy.bc, formulas[['industry.data.split.sat']]),
'industry.data.split.best' = test_train_split(industry.dummy.bc, formulas[['industry.data.split.best']])
)
#For loops initialization
no.reg.r2 <- c()
lasso.reg.r2 <- c()
ridge.reg.r2 <- c()
for (i in names(all.splits)) {
#no regularization
m = lm(formulas[[i]], all.splits[[i]][['data.train']])
#no.reg.r2 <- c(no.reg.r2, summary(m)$adj.r.squared)
y.predict = predict(m, newdata = as.data.frame(all.splits[[i]][['X.test']]))
adj.R2 <- eval_results(all.splits[[i]][['y.test']], y.predict, all.splits[[i]][['data.test']], length(coef(m)))['Rsquare']
no.reg.r2 <- c(no.reg.r2, adj.R2)
#lasso regularization
lasso.reg.r2 <- c(lasso.reg.r2, regularization_func(all.splits[[i]], 1, i))
#ridge regularization
ridge.reg.r2 <- c(ridge.reg.r2, regularization_func(all.splits[[i]], 0, i))
}
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 1 Regression Coefficients for: income_cleaned"
[1] 54
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: income_cleaned"
[1] 54
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 1 Regression Coefficients for: income_cleaned_bc"
[1] 54
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: income_cleaned_bc"
[1] 54
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 1 Regression Coefficients for: income.data.split.sat"
[1] 54
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: income.data.split.sat"
[1] 54
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 1 Regression Coefficients for: income.data.split.best"
[1] 41
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: income.data.split.best"
[1] 41
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 1 Regression Coefficients for: industry_cleaned"
[1] 65
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: industry_cleaned"
[1] 65
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 1 Regression Coefficients for: industry_cleaned_bc"
[1] 65
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: industry_cleaned_bc"
[1] 65
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 1 Regression Coefficients for: industry.data.split.sat"
[1] 65
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: industry.data.split.sat"
[1] 65
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 1 Regression Coefficients for: industry.data.split.best"
[1] 42
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: industry.data.split.best"
[1] 42
df.rsquare <- as.data.frame(cbind('noReg' = no.reg.r2, 'lassoReg' = lasso.reg.r2, 'ridgeReg' = ridge.reg.r2))
rownames(df.rsquare) = names(all.splits)
df.rsquare
# sat.model.bc <- lm(Avg.bc ~., data = xy.splits.sat[['industry']][['data.train']])
#
# temp.df <- as.data.frame(xy.splits.sat[['industry']][['X.test']])
#
# colnames(temp.df)[57]
#
# colnames(temp.df) <- str_replace_all(colnames(temp.df), "[`]", '')
# colnames(temp.df)
#
# y.predict <- predict(sat.model.bc, newdata = temp.df)
# MSE.sat <- mean((y.predict - xy.splits.sat[['industry']][['y.test']])^2)
#
# print(paste(i, ' Lasso MSE: ', MSE.lasso, ' Saturated MSE: ', MSE.sat))
#
# summary(sat.model.bc)$coefficients
# as.data.frame(xy.splits.sat[['industry']][['X.test']]) %>% select(starts_with('`GroupOth'))
# Calculate R squared from true values and predictions
eval_results <- function(true, predicted, df, p) {
n = nrow(df)
adj.RSS <- sum((predicted - true)^2)/(n-p-1)
adj.TSS <- sum((true - mean(true))^2)/(n-1)
adj.R_square <- 1 - adj.RSS / adj.TSS
#RMSE = sqrt(RSS/n)
return(c(Rsquare = adj.R_square))
}
#formula for adjusted R^2 found in following link.
#https://www.graphpad.com/guides/prism/latest/curve-fitting/reg_adjusted-r-squared.htm
Additional Calculations
as.data.frame(lasso.best.lambda.train.pred) %>% mutate(Avg_in_dollars = (lasso.best.lambda.train.pred*lambda.bc+1)^(1/lambda.bc))
income.sat.data.split <- test_train_split(income.dummy.bc, Avg.bc ~ .)
income.sat.data.split[['X.train']]
sat.model.bc <- lm(Avg.bc ~., data = as.data.frame(cbind(income.sat.data.split[['X.train']], income.sat.data.split[['y.train']])))
summary(sat.model.bc)
calc_MSE(sat.model.bc, as.data.frame(income.sat.data.split[['X.test']]), income.sat.data.split[['y.test']])
sat.y.predict <- predict(sat.model.bc, newdata = as.data.frame(income.sat.data.split[['X.test']]))
mean((sat.y.predict - income.sat.data.split[['y.test']])^2)